#!/usr/bin/python # -*- coding: UTF-8 -*- from __future__ import division from __future__ import absolute_import from __future__ import division, print_function, unicode_literals ########################### ### Autor: Sebastian Enger / M.Sc. ### Copyright: Sebastian Enger ### Licence: Commercial / OneTipp ### Version: 1.0.0b - 29-10-2015@23:53 Uhr ### Contact: sebastian.enger@gmail.com ### OneTipp Text Tool in Python: Main File ########################### """ Synonym bzw Wortersetzung parallelisieren für schnellere Verarbeitung und Reaktionszeit des Tools Antonym Datenbank Entwicklung mit Hilfe gecrawlter Websites Aufbau einer Datenbank mit einfacher deutscher Sprache Berechnung des lesbarkeitswerts eines eingabetextes - basierend auf einfachen Texten die "simple German " Datenbank für Austausch nutzen, Wissenschaftliche Texte mit Leipzig und unserer lokalen Synonym Datenbank austauschen Tests am 29.10.2015: https://github.com/rsennrich/clevertagger """ #https://docs.python.org/2/library/configparser.html import os import sys reload(sys) sys.path.append('/home/onetipp/python/modules') os.environ['PYTHON_EGG_CACHE'] = '/home/compress/' import random import codecs import re import mod import stopwords import pprint from textblob_de import TextBlobDE as TextBlob from textblob_de import PatternTagger from textblob_de import TextBlobDE import treetaggerwrapper cursorMysql = mod.mysql.cursor() noDoubleHash = set() re_match = r"(\?|\.|\!)" # Match: ". WORT" # lies die Ein und Ausgabedateien outputfile = sys.argv[1] # # read file into string # #text = codecs.open(inputfile, "r", encoding='utf-8').read() # # # sent_tokenize_list = sent_tokenize(text) # # Summarize the text first and then work on it # tSumy = mod.summarizeText(text) # #tokens = mod.nltk.word_tokenize(tSumy) # tokens = mod.nltk.sent_tokenize(tSumy, language='german') # tokensRaw = mod.nltk.word_tokenize(text) changeEveryWord = 6 #Leistungsschutzrecht: 7 Zeichen dürfen genutzt werden, darüber muss geändert werden changeEveryWordFlag = 0 changeEveryWordTemp = 0 #temporary upcount ignoreNextWord = 0 # https://perso.limsi.fr/pointal/doku.php?id=dev:treetaggerwrapper # https://subversion.renater.fr/ttpw/trunk/treetaggerwrapper.py # http://treetaggerwrapper.readthedocs.org/en/latest/#polls-of-taggers-process tagger = treetaggerwrapper.TreeTagger(TAGLANG='de', TAGDIR='/home/onetipp/software/treetagger/') cursorMysql.execute("SELECT p_articletext FROM (publish_de) ORDER BY RAND() LIMIT 1;") #cursorMysql.execute("SELECT p_articletext FROM (publish_de) WHERE BINARY `id` = '%s' LIMIT 1;" % (word)) result = cursorMysql.fetchall() for r in result: # r = mod.to_unicode(r[0]) if r is not None: r = mod.to_unicode(r) tags = tagger.tag_text(r) tags2 = treetaggerwrapper.make_tags(tags) pprint.pprint(tags2) # file schreiben #readabilityVar = str(mod.textstat.flesch_reading_ease(text)) # # with codecs.open(outputfile, 'w', encoding='utf-8') as f: # f.write() # f.close() mod.mysql.commit() mod.mysql.close() mod.sphinx.commit() mod.sphinx.close() exit(0) """ The Flesch Reading Ease formula function name - flesch_reading_ease(text) returns the Flesch Reading Ease Score. Following table is helpful to access the ease of readability in a document. 90-100 : Very Easy 80-89 : Easy 70-79 : Fairly Easy 60-69 : Standard 50-59 : Fairly Difficult 30-49 : Difficult 0-29 : Very Confusing """